%load_ext autoreload
%autoreload 2
%matplotlib notebook
import datetime
import sys
from typing import Iterable
sys.path.append("../../..")
import iso3166
import matplotlib
from matplotlib import pyplot
import pandas as pd
matplotlib.rcParams["figure.figsize"] = (9.5, 10.0)
matplotlib.rcParams["figure.max_open_warning"] = 100
import netanalysis.traffic.data.model as traffic
from netanalysis.traffic.data.file_repository import FileTrafficRepository
import netanalysis.traffic.analysis.find_anomalies as fa
from netanalysis.infrastructure.resources import resource_filename
# Google products whose traffic time series are inspected for disruptions.
# Ordered roughly by how widely used (and therefore how signal-rich) each is.
PRODUCT_LIST = [
traffic.ProductId.WEB_SEARCH, traffic.ProductId.MAPS, traffic.ProductId.IMAGES,
traffic.ProductId.YOUTUBE, traffic.ProductId.BLOGGER, traffic.ProductId.SITES,
traffic.ProductId.GMAIL, traffic.ProductId.GROUPS,
traffic.ProductId.TRANSLATE,
traffic.ProductId.SPREADSHEETS, traffic.ProductId.DOCS,
traffic.ProductId.EARTH
]
# File-backed traffic repository rooted at the bundled "traffic_data" resource.
repo = FileTrafficRepository(resource_filename("traffic_data"))
def plot_expectations(time_series, expectations):
    """Plot a traffic series against its expectation band, marking anomalies.

    Draws the raw series, the expected-value line, a shaded band between the
    lower and upper expectation bounds, and red dots on every date where the
    series drops below the lower bound.
    """
    time_series.plot()
    expectations.expected.plot(linewidth=1)
    pyplot.fill_between(time_series.index,
                        expectations.lower_bound,
                        expectations.upper_bound,
                        alpha=0.3, linewidth=0)
    # Dates where observed traffic fell under the expected lower bound.
    below_band = time_series < expectations.lower_bound
    anomalous_dates = below_band.loc[lambda flag: flag].index
    if len(anomalous_dates) > 0:
        time_series[anomalous_dates].plot(style='ro')
def show_region_traffic(region_code, product_ids) -> None:
    """Chart daily traffic and expectation bands for several products in one region.

    One stacked subplot per product; products with no traffic data are skipped
    (their subplot slot is left blank). Charts start at 2016.
    """
    fig = pyplot.figure()
    for slot, product_id in enumerate(product_ids):
        raw_traffic = repo.get_traffic(region_code, product_id)
        if raw_traffic.empty:
            continue
        daily = raw_traffic.resample("D").mean()
        band = fa.get_expectations_1(daily)
        ax = fig.add_subplot(len(product_ids), 1, slot + 1)
        ax.set_ylabel(product_id.name)
        ax.set_ylim(bottom=0, top=daily.max() * 1.1)
        plot_expectations(daily["2016":], band["2016":])
    fig.show()
def show_product_traffic(product_id: traffic.ProductId, regions: Iterable[str]) -> None:
    """Chart daily traffic and expectation bands for one product across regions.

    One stacked subplot per region; regions with no traffic data are skipped
    (their subplot slot is left blank). Charts start at 2016.
    """
    fig = pyplot.figure()
    for slot, region_code in enumerate(regions):
        raw_traffic = repo.get_traffic(region_code, product_id)
        if raw_traffic.empty:
            continue
        daily = raw_traffic.resample("D").mean()
        band = fa.get_expectations_1(daily)
        ax = fig.add_subplot(len(regions), 1, slot + 1)
        ax.set_ylabel(region_code)
        ax.set_ylim(bottom=0, top=daily.max() * 1.1)
        plot_expectations(daily["2016":], band["2016":])
    fig.show()
# A hand-picked subset can be substituted here to speed up the scan:
# INTERESTING_REGIONS = [
# "DZ", "BY", "CM", "CD", "EG", "ET", "GA", "GM", "IN", "IR", "IQ", "PK", "SA", "SY", "TG", "TR", "UA", "VN",
# "PR", "VI", "TC", "US", "ZW"
# ]
# Scan every region in the repository for the two highest-signal products.
INTERESTING_REGIONS = repo.list_regions()
all_disruptions = fa.find_all_disruptions(repo, INTERESTING_REGIONS, [traffic.ProductId.YOUTUBE, traffic.ProductId.WEB_SEARCH])
# Most recent disruptions first (sorted by start date, then end date).
all_disruptions.sort(reverse=True, key=lambda d: (d.start, d.end))
def show_disruption(region_disruption) -> None:
    """Chart every product disruption within one region-level disruption.

    Lays the per-product traffic charts out on a 3-column grid, shades the
    disruption interval grey, and pads the chart window by twice the
    disruption duration on each side (capped at "now").
    """
    num_product_disruptions = len(region_disruption.product_disruptions)
    num_columns = 3
    # Ceiling division so every product gets a grid cell. The previous
    # "(n + 1) / 2" was true division under Python 3 (a float, which
    # Figure.add_subplot rejects) and was computed for a 2-column layout
    # even though num_columns is 3.
    num_rows = (num_product_disruptions + num_columns - 1) // num_columns
    fig = pyplot.figure(figsize=(num_columns * 3.2, num_rows * 2.5))
    fig.tight_layout()
    fig.suptitle("%s %s - %s" % (
        iso3166.countries.get(region_disruption.region_code).name,
        region_disruption.start.date(), region_disruption.end.date()))
    # Chart window: the disruption plus padding of twice its duration on
    # each side, never extending past the present.
    end_date = region_disruption.end + datetime.timedelta(days=1)
    duration = end_date - region_disruption.start
    chart_padding = duration * 2
    chart_start_date = region_disruption.start - chart_padding
    chart_end_date = min(end_date + chart_padding, datetime.datetime.now())
    for index, product_disruption in enumerate(region_disruption.product_disruptions):
        chart_traffic = repo.get_traffic(
            region_disruption.region_code,
            product_disruption.product_id)[chart_start_date:chart_end_date]
        axes = fig.add_subplot(num_rows, num_columns, index + 1)
        axes.set_ylabel(product_disruption.product_id.name)
        axes.set_ylim(bottom=0, top=chart_traffic.max() * 1.1)
        axes.plot(chart_traffic)
        # Shade the detected disruption interval for visual reference.
        axes.axvspan(region_disruption.start, region_disruption.end,
                     alpha=0.2, color='grey')
    fig.show()
# A disruption spanning two or more products is treated as an internet shutdown.
internet_shutdowns = [
    disruption
    for disruption in all_disruptions
    if len(disruption.product_disruptions) >= 2
]
print("Found %s shutdowns" % len(internet_shutdowns))
for shutdown in internet_shutdowns:
    show_disruption(shutdown)
    fa.print_disruption_csv(shutdown)
Below are all the outages detected in Ethiopia.
# Show and print every detected disruption for Ethiopia only.
for disruption in all_disruptions:
    if disruption.region_code == "ET":
        show_disruption(disruption)
        fa.print_disruption_csv(disruption)
In the example below, we can see shutdowns in July, August, and October 2016, and at the end of May 2017.
# Ethiopia: per-product daily traffic with expectation bands, from 2016 on.
show_region_traffic("ET", [traffic.ProductId.WEB_SEARCH, traffic.ProductId.YOUTUBE,
traffic.ProductId.BLOGGER, traffic.ProductId.TRANSLATE,
traffic.ProductId.SITES])
It's possible to use the traffic data to detect product blocking. In the charts below, you can see exactly when YouTube was blocked in Iran and Uzbekistan, and when Ethiopia blocked YouTube and Blogger.
# YouTube blocking in Iran and Uzbekistan; YouTube and Blogger blocking in Ethiopia.
show_product_traffic(traffic.ProductId.YOUTUBE, ["IR", "UZ"])
show_region_traffic("ET", [traffic.ProductId.YOUTUBE, traffic.ProductId.BLOGGER])
In October 2016 there was a big bump in YouTube traffic in the US, which seems to have affected the numbers for other countries.
It may be that traffic for other countries started being geolocated in the US. Furthermore, traffic numbers are relative to global traffic, and US traffic dominates the global traffic for some products, so a significant increase in US traffic can reduce the relative traffic numbers for other countries.
# YouTube traffic across several countries around the October 2016 US bump.
show_product_traffic(traffic.ProductId.YOUTUBE, [
"IR", "IQ", "PK", "SA", "SY", "TR", "VN", "PR", "TC", "US"
])